This notebook presents the exploratory data analysis (EDA) for the Kaggle Playground Series - Season 5 Episode 2 (S5E2), which aims to predict backpack prices based on product attributes.
# Read the training and test datasets.
# By default read.csv() only recognises the literal string "NA" as missing in
# character columns, so blank fields would load as "" and stay invisible to
# is.na() / na.omit(). Map blanks to NA at load time so that missing
# categorical values are counted and handled like any other missing value.
train <- read.csv("train.csv", na.strings = c("", "NA"))
test <- read.csv("test.csv", na.strings = c("", "NA"))
# Check the structure of the training data
str(train)
## 'data.frame': 300000 obs. of 11 variables:
## $ id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Brand : chr "Jansport" "Jansport" "Under Armour" "Nike" ...
## $ Material : chr "Leather" "Canvas" "Leather" "Nylon" ...
## $ Size : chr "Medium" "Small" "Small" "Small" ...
## $ Compartments : num 7 10 2 8 1 10 3 1 8 2 ...
## $ Laptop.Compartment : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Waterproof : chr "No" "Yes" "No" "No" ...
## $ Style : chr "Tote" "Messenger" "Messenger" "Messenger" ...
## $ Color : chr "Black" "Green" "Red" "Green" ...
## $ Weight.Capacity..kg.: num 11.6 27.1 16.6 12.9 17.7 ...
## $ Price : num 112.2 68.9 39.2 80.6 86 ...
# Quartiles, mean and range of the prediction target
summary(train[["Price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 47.38 80.96 81.41 115.02 150.00
# Per-column count of NA entries (is.na() does not flag empty strings)
train %>%
  is.na() %>%
  colSums()
## id Brand Material
## 0 0 0
## Size Compartments Laptop.Compartment
## 0 0 0
## Waterproof Style Color
## 0 0 0
## Weight.Capacity..kg. Price
## 138 0
# Histogram of the raw target, with 50 bins requested
hist(
  x = train[["Price"]],
  main = "Distribution of Backpack Prices",
  xlab = "Price",
  breaks = 50
)
# Same histogram after a log(1 + x) transform of the target
hist(
  x = log1p(train[["Price"]]),
  main = "Log-Transformed Distribution of Backpack Prices",
  xlab = "Log(1 + Price)",
  col = "gray",
  breaks = 50
)
# Draw a reproducible 5,000-row subset so the scatter plot stays readable
set.seed(42)
train_sample <- train[sample.int(nrow(train), 5000), ]
# Scatter of weight capacity against price with a straight-line fit overlaid
ggplot(data = train_sample, mapping = aes(Weight.Capacity..kg., Price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  labs(title = "Weight Capacity vs. Price (Sampled 5k)")
# Numeric Features vs. Price
numeric_cols <- train %>% select(where(is.numeric))
# Correlation of every numeric column with Price (cor()'s default method),
# computed only on rows that are complete across all numeric columns.
# NOTE(review): `id` is numeric and is therefore included here; confirm
# whether a row identifier belongs in this chart.
correlations <- cor(numeric_cols, use = "complete.obs")["Price", ]
# Drop Price's self-correlation with a logical mask. Unlike
# `x[-which(cond)]`, a mask cannot silently return an empty vector when
# nothing matches the condition.
correlations <- sort(
  correlations[names(correlations) != "Price"],
  decreasing = TRUE
)
# Visualize
cor_df <- data.frame(
  Feature = names(correlations),
  Correlation = correlations
)
ggplot(cor_df, aes(x = Correlation, y = fct_reorder(Feature, Correlation))) +
  geom_col(fill = "lightblue") +
  labs(
    title = "Correlation of Numerical Features with Price",
    x = "Correlation with Price",
    y = "Feature"
  ) +
  theme_minimal()
# Categorical Feature: Brand Frequency
# Row count per brand, largest first, capped at 20 brands
brand_count <- train %>%
  group_by(Brand) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count)) %>%
  slice_head(n = 20)
# Bars ordered from most to least frequent brand
brand_count %>%
  ggplot(aes(x = reorder(Brand, -count), y = count)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Most Frequent Brands", x = "Brand", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Capacity vs. Price
# Re-draw the same seeded 5,000-row subset used for plotting
set.seed(42)
train_sample <- train[sample.int(nrow(train), size = 5000), ]
train_sample %>%
  ggplot(aes(x = Weight.Capacity..kg., y = Price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  labs(title = "Weight Capacity vs. Price (Sampled 5k)")
# Brand vs. Price
# Price distribution per brand (x labels tilted to avoid overlap)
train %>%
  ggplot(aes(x = Brand, y = Price)) +
  geom_boxplot() +
  labs(title = "Brand vs. Price") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Price distribution per material
train %>%
  ggplot(aes(x = Material, y = Price)) +
  geom_boxplot() +
  labs(title = "Material vs. Price")
# Price distribution by presence of a laptop compartment
train %>%
  ggplot(aes(x = Laptop.Compartment, y = Price)) +
  geom_boxplot() +
  labs(title = "Laptop Compartment vs. Price")
# Interaction key: brand and material joined with an underscore
train[["Brand_Material"]] <- with(train, paste(Brand, Material, sep = "_"))
# Row count per brand/material pair, most common first
top_combo <- train %>%
  group_by(Brand_Material) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(desc(count))
# Horizontal bars for the 20 most common pairs
top_combo %>%
  head(20) %>%
  ggplot(aes(x = reorder(Brand_Material, count), y = count)) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(
    title = "Top 20 Brand × Material Combinations",
    x = "Brand_Material",
    y = "Count"
  )
# Seeded 5,000-row sample, coloured by brand; the legend is hidden
set.seed(42)
sampled <- sample_n(train, size = 5000)
ggplot(
  data = sampled,
  mapping = aes(x = Weight.Capacity..kg., y = Price, colour = Brand)
) +
  geom_point(alpha = 0.5) +
  labs(title = "Weight Capacity vs Price Colored by Brand") +
  theme(legend.position = "none")
# Interaction keys: brand with weight capacity rounded to a whole number,
# and material with the laptop-compartment flag
train[["Brand_WeightCombo"]] <- with(
  train,
  paste(Brand, round(Weight.Capacity..kg.), sep = "_")
)
train[["Material_Laptop"]] <- with(
  train,
  paste(Material, Laptop.Compartment, sep = "_")
)
# Mean price per material/laptop pair, highest first
material_laptop_price <- train %>%
  group_by(Material_Laptop) %>%
  summarise(mean_price = mean(Price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_price))
# Horizontal bars for at most the first 20 pairs
material_laptop_price %>%
  head(20) %>%
  ggplot(aes(x = reorder(Material_Laptop, mean_price), y = mean_price)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(
    title = "Top 20 Material × LaptopFeature Importance Avg Prices",
    x = "Material × Laptop Compartment",
    y = "Average Price"
  )
# Hold the brand fixed (Jansport only) and look at how weight capacity and
# compartment count relate to price
sub_train <- filter(train, Brand == "Jansport")
sub_train %>%
  ggplot(aes(x = Weight.Capacity..kg., y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "blue") +
  labs(title = "Within Jansport: Weight Capacity vs Price")
sub_train %>%
  ggplot(aes(x = Compartments, y = Price)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", colour = "red") +
  labs(title = "Within Jansport: Compartments vs Price")
The trend lines (blue and red) are almost horizontal, with no apparent
slope, indicating that the correlation is very weak.
# Modelling frame: the nine predictors plus the target, keeping only rows
# with no NA in any selected column.
# NOTE(review): na.omit() only drops NA; blank-string categories, if any are
# present in the data, pass through as their own level. Confirm how missing
# categorical values are encoded upstream.
train_model <- train %>%
  select(
    Compartments, Weight.Capacity..kg., Brand, Material, Size,
    Laptop.Compartment, Waterproof, Style, Color, Price
  ) %>%
  na.omit()
# Dummy-encode the predictors into a numeric design matrix without an
# intercept column (`- 1`)
X <- model.matrix(Price ~ . - 1, data = train_model)
# Some dummy names inherit whitespace from factor levels (e.g. a two-word
# brand). Replace it with underscores here so LightGBM does not have to
# rename features itself and emit a warning.
colnames(X) <- gsub("[[:space:]]", "_", colnames(X))
y <- train_model$Price
# Set parameters: plain regression objective, evaluated with RMSE
dtrain <- lgb.Dataset(data = X, label = y)
params <- list(objective = "regression", metric = "rmse")
# Train for 100 boosting rounds
model <- lgb.train(params = params, data = dtrain, nrounds = 100)
## [LightGBM] [Warning] Found whitespace in feature_names, replace with underlines
## [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045109 seconds.
## You can set `force_col_wise=true` to remove the overhead.
## [LightGBM] [Info] Total Bins 318
## [LightGBM] [Info] Number of data points in the train set: 299862, number of used features: 28
## [LightGBM] [Info] Start training from score 81.419736
# Per-feature importance table (Gain, Cover, Frequency) from the fitted model
importance <- lgb.importance(model = model)
print(importance)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: Weight.Capacity..kg. 0.40490899 0.309545919 0.35433333
## 2: Compartments 0.12232589 0.078358384 0.13633333
## 3: MaterialLeather 0.03004754 0.056453124 0.02200000
## 4: MaterialNylon 0.02739180 0.027698039 0.02533333
## 5: WaterproofYes 0.02539792 0.018337336 0.02200000
## 6: WaterproofNo 0.02308945 0.036453774 0.02466667
## 7: ColorGray 0.02163621 0.027349327 0.02233333
## 8: Laptop.CompartmentNo 0.02028806 0.010661375 0.02066667
## 9: StyleTote 0.02016635 0.007396463 0.02366667
## 10: BrandUnder_Armour 0.01954133 0.023618748 0.01933333
## 11: StyleMessenger 0.01940521 0.006768977 0.02433333
## 12: ColorGreen 0.01935373 0.056373812 0.02066667
## 13: SizeLarge 0.01902882 0.031838092 0.02366667
## 14: BrandAdidas 0.01819296 0.050529276 0.02066667
## 15: ColorRed 0.01788407 0.016432286 0.01900000
## 16: ColorBlack 0.01694886 0.048341202 0.01933333
## 17: BrandPuma 0.01684930 0.005900360 0.01933333
## 18: SizeSmall 0.01638647 0.019204197 0.01933333
## 19: ColorPink 0.01633635 0.008003286 0.01833333
## 20: SizeMedium 0.01589825 0.006400142 0.01500000
## 21: BrandJansport 0.01541843 0.020633310 0.01833333
## 22: Laptop.CompartmentYes 0.01409706 0.015195558 0.01766667
## 23: MaterialCanvas 0.01390546 0.029451614 0.01666667
## 24: ColorBlue 0.01372862 0.042023751 0.01700000
## 25: MaterialPolyester 0.01352541 0.022924345 0.01700000
## 26: Brand 0.01316909 0.014160785 0.01266667
## 27: StyleBackpack 0.01273681 0.008751055 0.01700000
## 28: BrandNike 0.01234155 0.001195463 0.01333333
## Feature Gain Cover Frequency
# Bar chart of the 20 highest-ranked features from the importance table
lgb.plot.importance(tree_imp = importance, top_n = 20)
# Next Steps
Based on the exploratory analysis, the following steps are planned for the modeling phase:

- Encode the categorical features (Brand, Material, Size, Style) using one-hot encoding or target encoding.
- Convert the binary features (Waterproof, Laptop.Compartment) into logical/numeric format.
- Generate predictions for test.csv and submit to Kaggle.

These steps will be iteratively refined based on validation results and model diagnostics.